Globalno horizontalno zračenje (GHI), ili ukupna količina kratkotalasnog sunčevog zračenja koje dospe na zemljinu površinu, predstavlja ključni parametar za solarne elektrane. Ova vrednost, koja obuhvata i direktno i difuzno sunčevo zračenje, trenutno se meri geo-stacionarnim satelitima.
Predviđanje perioda sa visokim GHI indeksom omogućava solarnim elektranama proaktivni pristup optimizaciji rada. To se ogleda u:
Skup podataka odnosi se na merenje GHI zračenja na području Vijetnama i javno je dostupan na adresi https://data.openei.org/s3_viewer?bucket=nrel-pds-nsrdb&prefix=vietnam%2F. Objavila ga je laboratorija NREL (National Renewable Energy Laboratory) u okviru baze NSRDB (National Solar Radiation Database) i dostupan je pod licencom Creative Commons Attribution 3.0 United States License.
# One-off dependency installation (run once, not on every render).
# rhdf5 is distributed through Bioconductor, so it is installed via
# BiocManager; the remaining packages come straight from CRAN.
install.packages("BiocManager")
BiocManager::install("rhdf5")
for (pkg in c("kableExtra", "ggpubr", "sparklyr", "caret", "webshot2")) {
  install.packages(pkg)
}
# Attach every package used throughout the analysis.
library(rhdf5)      # reading the HDF5 dataset (Bioconductor)
library(ggplot2)    # plotting
library(dplyr)      # data-manipulation verbs
library(magrittr)   # %>% and %<>% pipes
library(lubridate)  # date-time parsing (ymd_hms)
library(kableExtra) # styled HTML tables
library(ggpubr)
library(sparklyr)   # Spark interface for the ML models below
library(caret)      # confusionMatrix() for model evaluation
library(webshot2)   # rendering kable HTML tables to PNG
# Fix the RNG seed so the shuffling/sampling below is reproducible.
set.seed(10)
# Downloads a local Spark distribution if it is not already present.
spark_install(version="3.3.2")
# NOTE(review): machine-specific Java location — adjust per environment.
Sys.setenv(JAVA_HOME="/usr/lib/jvm/java-11-openjdk")
# NOTE(review): hard-coded absolute project path; breaks on other machines.
knitr::opts_knit$set(root.dir = "/mnt/StorageSpace/StorageSpace/repositories/ghi-predicting")
# Enlarge the Spark driver heap so the whole dataset fits into it.
conf <- spark_config()
conf$`sparklyr.shell.driver-memory` <- "16G"
# Give 90% of the heap to execution/storage (Spark's default is 0.6).
conf$spark.memory.fraction <- 0.9
sc <- spark_connect(master = "local", version="3.3.2", config = conf)
Temperaturu vazduha i brzinu vetra takođe je moguće meriti pomoću satelita.
# Open the yearly HDF5 file and read out the individual datasets.
h5_file <- H5Fopen("./dataset/vietnam_2016.h5")
air_temp <- h5read(h5_file, "/air_temperature") # celsius
coordinates <- h5read(h5_file, "/coordinates") # angle
ghi <- h5read(h5_file, "/ghi") #W/m^2
meta <- h5read(h5_file, "/meta") # elevation: m
time_index <- h5read(h5_file, "/time_index") # date
wind_speed <- h5read(h5_file, "/wind_speed") # m/s
# Render the file structure (dataset name, class, dimensions) as a styled
# HTML table and save it as a PNG for the report.
kable_out <- knitr::kable(h5ls(h5_file) %>%
select(name,dclass,dim),
format = "html",
) %>%
kableExtra::kable_styling(bootstrap_options = "bordered", full_width = F, font_size = 16)
save_kable(kable_out, file = "tables/hdf5_structure.png")
Zaključujemo da imamo 75361 redova zbog 75361 koordinate koje se posmatraju i svaki red sadrži onoliko kolona koliko je bilo merenja u različitim vremenskim trenucima.
Kolona ima onoliko koliko je sati trajalo posmatranje tokom (prestupne) 2016. godine: 366 * 24 = 8784.
Budući da nas zanima klasifikacija zračenja u letnjem periodu filtriramo podatke koji su izmereni od 21. juna (172. dan) do 22. septembra (266. dan).
# Restrict to summer days: 21 Jun (day 172) through 22 Sep (day 266).
summer_start <- 172 # day-of-year
summer_end <- 266
# With 1-based columns, day d occupies columns ((d-1)*24 + 1) : (d*24),
# so the summer window starts at (summer_start - 1) * 24 + 1.
# BUG FIX: the original started at summer_start * 24, which is the LAST
# hour of day 172 and therefore skipped almost the whole first summer day.
summer_hours <- ((summer_start - 1) * 24 + 1):(summer_end * 24)
# Keep only daylight hours. Column i holds hour-of-day (i - 1) %% 24, so
# i %% 24 in (sunrise + 1):(sunset + 1) selects hours 5..19 inclusive.
sunrise <- 5
sunset <- 19
sunny_hours <- summer_hours[summer_hours %% 24 %in% (sunrise + 1):(sunset + 1)]
# Thin the spatial grid: keep every 10th measurement location to reduce
# data volume while preserving geographic coverage.
coordinate_step <- 10
time_measures_per_coor <- length(sunny_hours)
coor_num <- nrow(air_temp)
selected_rows <- seq(1, coor_num, by = coordinate_step)
coor_num <- length(selected_rows)
Kada smo zaključili opseg podataka (selected_rows, sunny_hours) za letnji period u suncem obasjanim intervalima, ekstrahujemo podatke iz h5 matrice i pretvaramo ih u tabelarni format pogodan za dalje korišćenje.
# Flatten the (location x hour) matrices into one long table where each
# row is a single (location, hour) measurement.
id <- 1:(coor_num * time_measures_per_coor)
# coordinates appears to be indexed [axis, location]: row 1 = latitude,
# row 2 = longitude (NOTE(review): inferred from usage — verify layout).
latitude <- rep(coordinates[1, selected_rows], each = time_measures_per_coor)
longitude <- rep(coordinates[2, selected_rows], each = time_measures_per_coor)
elevation <- rep(meta[selected_rows,"elevation"], each = time_measures_per_coor)
rm(coordinates)
# Every location repeats the same sequence of timestamps.
time <- rep(time_index[sunny_hours], coor_num)
# NOTE(review): c() flattens a matrix column-major (hour-major), while the
# rep(..., each =) calls above are location-major — confirm the two
# orderings actually line up row-for-row, or whether h5read returned the
# matrices transposed.
air_temp <- c(air_temp[selected_rows, sunny_hours])
wind_speed <- c(wind_speed[selected_rows, sunny_hours])
ghi <- c(ghi[selected_rows, sunny_hours])
df <- data.frame(id, time, latitude, longitude, elevation, air_temp, wind_speed, ghi)
# Drop zero (night-time) GHI readings and parse timestamps into POSIXct.
df <- df %>%
filter(ghi != 0) %>%
mutate(time = ymd_hms(strptime(time, "%Y-%m-%d %H:%M:%S")))
# Release all open HDF5 handles, then drop every helper object so that
# only `df` (plus the Spark session) remains in the workspace.
h5closeAll()
rm(id, time, latitude, longitude, elevation, air_temp, wind_speed, ghi,
   h5_file, meta, coor_num, coordinate_step, selected_rows, summer_end,
   summer_hours, summer_start, sunny_hours, sunrise, sunset, time_index,
   time_measures_per_coor)
Sa ovih grafikona može se zaključiti da nadmorska visina nema posebnu povezanost sa bilo kojom preostalom promenljivom dok se mogu uočiti trendovi u odnosima brzine vetra, temperature vazduha i GHI.
Klasifikacija je vršena za GHI, gde bi se njegova vrednost klasifikovala kao normalna ili visoka (NORMAL, HIGH), što bi bilo od značaja elektrokompanijama za proaktivan pristup održavanju postrojenja, optimizaciji resursa i raspodeli posla.
Kao prag za visoko svetlosno zračenje, iako se generalno uzima 600 W/m², na osnovu prethodno iscrtanih grafikona odabiramo vrednost trećeg kvartila, kako bismo se bolje prilagodili našem konkretnom skupu podataka vezanom za Vijetnam.
# HIGH-class threshold: the 3rd quartile of observed GHI (adapts the
# commonly cited fixed 600 W/m^2 cut-off to this dataset).
threshold <- quantile(df$ghi, probs = 0.75)
threshold
df %<>%
mutate(ghi_c = factor(ifelse(ghi > threshold, "HIGH", "NORMAL")))
# Number of cross-validation folds.
k_folds <- 5
# Shuffle the rows so contiguous id ranges form random folds.
df_shuffled <- df[sample(nrow(df)), ]
df_shuffled %<>%
mutate(id = 1:nrow(df)) # re-number ids for easier fold slicing
# Copy into Spark (tbl_spark) for the ML steps below.
df_spark <- copy_to(sc, df_shuffled, "df_spark", overwrite = TRUE)
# Spark ML functions do not accept timestamps, so convert time to numeric.
df_spark %<>%
mutate(time_numeric = as.numeric(time))
formula <- ghi_c ~ air_temp + wind_speed + longitude + latitude + time_numeric
# Split `data` into train/test sets for fold `fold_num` of k-fold CV.
# Rows whose id falls inside the fold's contiguous range become the test
# set; every other row goes into the training set.
split_data <- function(data, fold_num, fold_size) {
  lo <- (fold_num - 1) * fold_size + 1
  hi <- lo + fold_size - 1
  in_fold <- data %>%
    filter(id >= lo & id <= hi)
  out_of_fold <- data %>%
    filter(!(id >= lo & id <= hi))
  list(train_set = out_of_fold, test_set = in_fold)
}
# Evaluate a fitted Spark classification model on `test_set`.
# Returns accuracy, sensitivity and specificity computed with
# caret::confusionMatrix. The misspelled key "sensitivty" is kept for
# backward compatibility with existing callers; the correctly spelled
# "sensitivity" key is provided alongside it.
calc_perf <- function(model, test_set){
  pred <- ml_predict(model, test_set)
  # NOTE(review): confusionMatrix(data, reference) expects predictions as
  # the first argument; here the true labels are passed first and the
  # predictions second, which swaps the meaning of sensitivity and
  # specificity. Kept as-is so reported numbers stay comparable with
  # earlier runs — confirm the intended orientation.
  confMat <- confusionMatrix(factor(pull(test_set, "ghi_c")), factor(pull(pred, "predicted_label")))
  accuracy <- confMat$overall[["Accuracy"]]
  sensitivity <- confMat$byClass[["Sensitivity"]]
  specificity <- confMat$byClass[["Specificity"]]
  list(accuracy = accuracy,
       sensitivty = sensitivity,  # legacy misspelling, still read by callers
       sensitivity = sensitivity,
       specificity = specificity)
}
# Shared quantities for the k-fold loops: total row count and fold size.
# Note: the remainder rows (row_num %% k_folds) never land in a test fold.
counted <- df_spark %>% count() %>% collect()
row_num <- counted[[1, 1]]
fold_size <- row_num %/% k_folds
Kao brojčani pokazatelji performansi za sva 3 modela računati su:
Podešavani hiperparametri:
# Logistic regression: three hyperparameter scenarios, each evaluated with
# k-fold cross-validation; per-fold metrics are averaged per scenario.
start_time <- proc.time()
# Scenario i pairs reg_parameter[i] (regularization) with max_iterations[i].
reg_parameter <- c(0, 0.001, 0.1)
max_iterations <- c(100, 500, 1000)
accuracies <- numeric(3)
sensitivities <- numeric(3)
specificities <- numeric(3)
for(scenario in 1:3){
accuracy_sum <- 0
sensitivity_sum <- 0
specificity_sum <- 0
for(i in 1:k_folds){
paritioned_data <- split_data(df_spark, i, fold_size)
logreg <- ml_logistic_regression(paritioned_data$train_set,
formula,
reg_param = reg_parameter[scenario],
max_iter = max_iterations[scenario],
family = "binomial")
performance <- calc_perf(logreg, paritioned_data$test_set)
accuracy_sum <- accuracy_sum + performance[["accuracy"]]
# NOTE(review): "sensitivty" matches the misspelled key set in calc_perf.
sensitivity_sum <- sensitivity_sum + performance[["sensitivty"]]
specificity_sum <- specificity_sum + performance[["specificity"]]
}
# Average the fold metrics for this scenario.
accuracies[scenario] <- accuracy_sum / k_folds
sensitivities[scenario] <- sensitivity_sum / k_folds
specificities[scenario] <- specificity_sum / k_folds
}
# Present the averaged results as a styled HTML table saved to PNG.
results_logreg <- data.frame(accuracies,
sensitivities,
specificities)
rownames(results_logreg) <- c("Scenario 1", "Scenario 2", "Scenario 3")
colnames(results_logreg) <- c("Accuracy", "Sensitivity", "Specificity")
kable_out <- knitr::kable(results_logreg,
label = "Prikaz rezultata logisticke regresije",
format = "html"
) %>%
kableExtra::kable_styling(bootstrap_options = "bordered", full_width = FALSE, font_size = 16)
end_time <- proc.time()
elapsed = end_time-start_time
# Elapsed wall-clock time in minutes.
print(paste("Elapsed time:",round(elapsed[3] / 60,2)))
save_kable(kable_out, file = "tables/results_logreg.png")
Scenario 3 (osetljivost 0.6343) je najbolji izbor za solarne centrale. Prioritet je detektovati periode visokog zračenja, čak i uz malo nižu preciznost i više lažnih alarma, radi maksimalne proizvodnje solarne energije.
Podešavani hiperparametri:
# Linear SVC: three hyperparameter scenarios, each evaluated with k-fold
# cross-validation; per-fold metrics are averaged per scenario.
start_time <- proc.time()
#### Training and validation
reg_parameter <- c(0, 0.001, 0.1)
max_iterations <- c(100, 500, 1000)
accuracies <- numeric(3)
sensitivities <- numeric(3)
specificities <- numeric(3)
for(scenario in 1:3){
  accuracy_sum <- 0
  sensitivity_sum <- 0
  specificity_sum <- 0
  for(i in 1:k_folds){
    paritioned_data <- split_data(df_spark, i, fold_size)
    # BUG FIX: this section is labelled and saved as "lsvc" but previously
    # fit ml_logistic_regression again (copy-paste from the section above).
    # Fit the linear support-vector classifier it was meant to evaluate.
    lsvc <- ml_linear_svc(paritioned_data$train_set,
                          formula,
                          reg_param = reg_parameter[scenario],
                          max_iter = max_iterations[scenario])
    performance <- calc_perf(lsvc, paritioned_data$test_set)
    accuracy_sum <- accuracy_sum + performance[["accuracy"]]
    # "sensitivty" matches the misspelled key defined in calc_perf.
    sensitivity_sum <- sensitivity_sum + performance[["sensitivty"]]
    specificity_sum <- specificity_sum + performance[["specificity"]]
  }
  # Average the fold metrics for this scenario.
  accuracies[scenario] <- accuracy_sum / k_folds
  sensitivities[scenario] <- sensitivity_sum / k_folds
  specificities[scenario] <- specificity_sum / k_folds
}
# Present the averaged results as a styled HTML table saved to PNG.
results_lsvc <- data.frame(accuracies,
                           sensitivities,
                           specificities)
rownames(results_lsvc) <- c("Scenario 1", "Scenario 2", "Scenario 3")
colnames(results_lsvc) <- c("Accuracy", "Sensitivity", "Specificity")
kable_out <- knitr::kable(results_lsvc,
                          label = "Prikaz rezultata lsvc",
                          format = "html"
                          ) %>%
  kableExtra::kable_styling(bootstrap_options = "bordered", full_width = FALSE, font_size = 16)
end_time <- proc.time()
elapsed <- end_time - start_time
# Elapsed wall-clock time in minutes.
print(paste("Elapsed time:", round(elapsed[3] / 60, 2)))
save_kable(kable_out, file = "tables/results_lsvc.png")
Scenario 3 (osetljivost 0.6343) je najbolji izbor za solarne centrale. Prioritet je detektovati periode visokog zračenja, čak i uz malo nižu preciznost i više lažnih alarma, radi maksimalne proizvodnje solarne energije.
Na osnovu rezultata da se zaključiti da parametri u 2. scenariju postižu najbolje performanse.
# Decision tree: three scenarios over tree depth and minimum node size,
# evaluated with the same k-fold procedure as the previous models.
start_time <- proc.time()
#### Training and validation
# Scenario i pairs max_depth[i] with min_instances_per_node[i].
max_depth <- c(5,10,20)
min_instances_per_node <- c(2,5,10)
accuracies <- numeric(3)
sensitivities <- numeric(3)
specificities <- numeric(3)
for(scenario in 1:3){
accuracy_sum <- 0
sensitivity_sum <- 0
specificity_sum <- 0
for(i in 1:k_folds){
paritioned_data <- split_data(df_spark, i, fold_size)
dt <- ml_decision_tree_classifier(paritioned_data$train_set,
formula,
max_depth = max_depth[scenario],
min_instances_per_node = min_instances_per_node[scenario]
)
performance <- calc_perf(dt, paritioned_data$test_set)
accuracy_sum <- accuracy_sum + performance[["accuracy"]]
# NOTE(review): "sensitivty" matches the misspelled key set in calc_perf.
sensitivity_sum <- sensitivity_sum + performance[["sensitivty"]]
specificity_sum <- specificity_sum + performance[["specificity"]]
}
# Average the fold metrics for this scenario.
accuracies[scenario] <- accuracy_sum / k_folds
sensitivities[scenario] <- sensitivity_sum / k_folds
specificities[scenario] <- specificity_sum / k_folds
}
# Present the averaged results as a styled HTML table saved to PNG.
results_dt <- data.frame(accuracies,
sensitivities,
specificities)
rownames(results_dt) <- c("Scenario 1", "Scenario 2", "Scenario 3")
colnames(results_dt) <- c("Accuracy", "Sensitivity", "Specificity")
kable_out <- knitr::kable(results_dt,
label = "Prikaz rezultata stabla odlucivanja",
format = "html"
) %>%
kableExtra::kable_styling(bootstrap_options = "bordered", full_width = FALSE, font_size = 16)
end_time <- proc.time()
elapsed = end_time-start_time
# Elapsed wall-clock time in minutes.
print(paste("Elapsed time:",round(elapsed[3] / 60,2)))
save_kable(kable_out, file = "tables/results_dt.png")
Na osnovu rezultata da se zaključiti da parametri u 3. scenariju postižu najbolje performanse.
Nakon analize svih rezultata na nivou svih metoda zaključujemo da najbolje performanse postiže 3. scenario metode stabla odlučivanja.
# Build a second Spark copy of the data with z-score-normalized features
# for clustering (k-means is distance based, so feature scale matters).
df_spark_normalized <- copy_to(sc, df_shuffled, "df_spark_normalized", overwrite = TRUE)
df_spark_normalized %<>%
  mutate(time_numeric = as.numeric(time))
features <- c("air_temp", "wind_speed")
means <- numeric(length(features))
stds <- numeric(length(features))
names(means) <- features
names(stds) <- features
# Collect per-feature mean and standard deviation from Spark.
for(col in features){
  summary_stats <- df_spark_normalized %>%
    summarise(
      mean_value = mean(!!sym(col), na.rm = TRUE),
      std_value = sd(!!sym(col), na.rm = TRUE)
    ) %>%
    collect()
  means[col] <- summary_stats$mean_value
  stds[col] <- summary_stats$std_value
}
# Apply (x - mean) / sd per feature. The constants are unquoted with !!
# so dbplyr embeds their values directly; the original referenced bare
# locals named `mean`/`std`, which shadowed base::mean / stats::sd and
# relied on dbplyr's environment lookup to resolve them.
for (col in features) {
  mu <- means[[col]]
  sigma <- stds[[col]]
  df_spark_normalized <- df_spark_normalized %>%
    mutate(!!sym(col) := (!!sym(col) - !!mu) / !!sigma)
}
Crtamo elbow plot kako bismo odredili najpogodnije k za klasterizaciju.
start_time <- proc.time()
# Cluster on the two normalized features and record the model cost
# (within-cluster sum of squared errors) for k = 2..6.
formula_clustering <- ~ air_temp + wind_speed
sum_of_squared_errors <- numeric(6)
for(k in 2:6){
  print(k)
  clustered <- ml_bisecting_kmeans(df_spark_normalized, formula = formula_clustering, k = k)
  # NOTE(review): `$cost` is assumed to hold the fitted model's training
  # cost — confirm against the sparklyr bisecting k-means API.
  sum_of_squared_errors[k] <- clustered$cost
}
# BUG FIX: the plot previously included k = 1 with its never-computed
# placeholder SSE of 0 (the numeric(6) default), which distorted the
# elbow curve; plot only the values actually computed for k = 2..6.
k_errors_df <- data.frame(2:6, sum_of_squared_errors[2:6])
colnames(k_errors_df) <- c("K", "sum_of_squared_errors")
# Build the elbow plot.
k_elbow_plot <- ggplot(k_errors_df) +
  geom_line(mapping = aes(x = K, y = sum_of_squared_errors)) +
  labs(title = "Sum of squared errors over number of clusters",
       x = "Number of clusters",
       y = "Sum of squared errors ") +
  theme_bw() +
  scale_x_continuous(breaks = seq(2, 6, by = 1)) # integer breaks over the plotted k range
k_elbow_plot
end_time <- proc.time()
elapsed <- end_time - start_time
# Elapsed wall-clock time in minutes.
print(paste("Elapsed time:", round(elapsed[3] / 60, 2)))
Klasterizacija prema 2 scenarija:
Izabran je Bisecting KMeans jer:
# Fit the two clustering scenarios (k = 3 and k = 4) selected from the
# elbow analysis, on the normalized features.
cluster_k_3 <- ml_bisecting_kmeans(df_spark_normalized, formula = formula_clustering, k = 3)
cluster_k_4 <- ml_bisecting_kmeans(df_spark_normalized, formula = formula_clustering, k = 4)
Povezujemo klase sa nenormalizovanom tabelom
# Extract the cluster assignment for each row, for both scenarios.
predicted_clusters_3 <- ml_predict(cluster_k_3, df_spark_normalized)
predicted_clusters_3 %<>%
select(prediction) %>%
rename("cluster_3" = prediction)
predicted_clusters_4 <- ml_predict(cluster_k_4, df_spark_normalized)
predicted_clusters_4 %<>%
select(prediction) %>%
rename("cluster_4" = prediction)
# NOTE(review): sdf_bind_cols pairs rows positionally; this assumes Spark
# preserves row order between df_spark and the prediction tables — a join
# on id would be safer. Verify before relying on the attached clusters.
df_spark <- sdf_bind_cols(df_spark,predicted_clusters_3, predicted_clusters_4)
# Tear down the Spark session.
spark_disconnect(sc)